library(ggplot2)
library(heatmaply)
library(Rtsne)
# Load the gene-expression table; the "X" column supplies the row names.
# stringsAsFactors = TRUE is passed explicitly because R >= 4.0 reads text
# columns as character by default, in which case summary() would report
# Length/Class/Mode instead of the per-level counts expected below.
data <- read.csv("geneExpression_GDSC.csv", row.names = "X",
                 stringsAsFactors = TRUE)
data
# Per-level counts of the GeneralType column.
summary(data$GeneralType)
##    aero_dig_tract              bone            breast  digestive_system 
##                77                35                49                48 
##            kidney   large_intestine          leukemia              lung 
##                33                45                76                22 
##        lung_NSCLC         lung_SCLC          lymphoma           myeloma 
##               108                55                65                17 
##    nervous_system     neuroblastoma          pancreas              skin 
##                55                31                30                53 
##       soft_tissue           thyroid urogenital_system              NA's 
##                19                16               100                 1
# Summarise the Type column (per-level counts when it is a factor).
summary(data[["Type"]])
##            acute_myeloid_leukaemia                      adrenal_gland 
##                                 24                                  1 
##     anaplastic_large_cell_lymphoma                    B_cell_leukemia 
##                                  3                                 12 
##                    B_cell_lymphoma                      biliary_tract 
##                                 31                                  5 
##                            Bladder                         bone_other 
##                                 19                                  2 
##                             breast                   Burkitt_lymphoma 
##                                 49                                 13 
##                             cervix                     chondrosarcoma 
##                                 14                                  3 
##          chronic_myeloid_leukaemia             digestive_system_other 
##                                 10                                  1 
##                        endometrium                     ewings_sarcoma 
##                                 11                                 21 
##                       fibrosarcoma                             glioma 
##                                  2                                 51 
##      haematopoietic_neoplasm other               hairy_cell_leukaemia 
##                                  6                                  3 
##                      head and neck                   Hodgkin_lymphoma 
##                                 42                                  9 
##                             kidney                    large_intestine 
##                                 32                                 45 
##                           leukemia                              liver 
##                                  3                                 14 
##          lung_NSCLC_adenocarcinoma               lung_NSCLC_carcinoid 
##                                 65                                  4 
##              lung_NSCLC_large cell           lung_NSCLC_not specified 
##                                 13                                 11 
## lung_NSCLC_squamous_cell_carcinoma                         Lung_other 
##                                 15                                  1 
##          lung_small_cell_carcinoma             lymphoblastic_leukemia 
##                                 55                                 11 
##     lymphoblastic_T_cell_leukaemia            lymphoid_neoplasm other 
##                                  8                                 10 
##                    medulloblastoma                           melanoma 
##                                  4                                 50 
##                       mesothelioma                            myeloma 
##                                 21                                 12 
##                      neuroblastoma                         oesophagus 
##                                 31                                 35 
##                       osteosarcoma                              ovary 
##                                  9                                 41 
##                           pancreas                           prostate 
##                                 30                                  7 
##                   rhabdomyosarcoma                         skin_other 
##                                  9                                  3 
##                  soft_tissue_other                            stomach 
##                                  8                                 28 
##                    T_cell_leukemia                             testis 
##                                  3                                  1 
##                            thyroid            urogenital_system_other 
##                                 16                                  4 
##                             uterus                               NA's 
##                                  3                                  1

Visualize your data with ggplot

You can find a ggplot2 cheat sheet here ;) : https://rstudio.com/resources/cheatsheets/

Let’s create a bar chart showing the number of cell lines per tissue type.

# Bar chart: geom_bar() counts the rows falling into each GeneralType, and the
# bars are filled by that same variable.
ggplot(data, aes(x = GeneralType)) +
  geom_bar(aes(fill = GeneralType)) +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))

# what happens if you remove theme?

Let’s try another plot that takes two variables.

# Box plot of the JUN column for each GeneralType, boxes filled by GeneralType.
ggplot(data, aes(x = GeneralType, y = JUN)) +
  geom_boxplot(aes(fill = GeneralType)) + # geom_point() +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))

# You can also try other genes in place of JUN.

Clustering analysis

set.seed(1000) # to make sampling reproducible

# Keep only the rows whose GeneralType is 'breast' or 'lung'.
filter_cell <- data$GeneralType %in% c("breast", "lung")
# Draw the 50 random columns from the non-annotation columns only. Sampling
# from all of colnames(data) could pick 'Type' or 'GeneralType' a second time,
# which would duplicate an annotation column and leave fewer than 50 genes.
filter_gene <- c("Type", "GeneralType",
                 sample(setdiff(colnames(data), c("Type", "GeneralType")), 50))
# Clustered heatmap (average linkage) of the selected rows and columns.
# NOTE(review): layout(width, height) triggers a plotly deprecation warning;
# consider passing the size to heatmaply() itself -- confirm it accepts
# width/height before switching.
heatmaply(data[filter_cell, filter_gene], column_text_angle = 90,
          hclust_method = "average") %>% layout(width = 1000, height = 900)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

dimensionality reduction (PCA)

# Restrict the analysis to rows whose GeneralType is breast, lung or pancreas.
filter_cell <- is.element(data$GeneralType, c("breast", "lung", "pancreas"))
# Run PCA on columns 3 onwards (selecting all numeric columns; this assumes
# the first two columns are the non-numeric annotations -- TODO confirm).
PCA <- prcomp(data[filter_cell, seq(3, ncol(data))])

# Squared standard deviations of the first 10 components, i.e. the amount of
# variance each of them captures.
barplot((PCA$sdev^2)[1:10], las = 2)

# Scores of the first 10 rows on the first 5 principal components.
PCA$x[seq_len(10), seq_len(5)]
##                 PC1       PC2        PC3         PC4        PC5
## AU565    -13.724367 -1.871347 -5.3455675   1.9187103 -2.8918538
## BT-20     -5.445793 -5.249889  0.9582700  -1.8191926 -2.2218144
## BT-474   -15.100586  1.672376 -4.5099641  -1.5250846  2.0464121
## BT-483   -17.661129  2.925127  0.4103088   0.5627541  0.5346582
## BT-549     7.192831 12.845869  1.8666512  -3.2586385 -3.1331359
## CAL-120    7.303171  6.230816  2.3337983   1.5738670  4.7528825
## CAL-148  -16.637455  5.462866 -0.1385591   3.0094828  1.4583948
## CAL-51    -0.162678  3.928606  6.1819198   0.4521046  0.6904870
## CAL-85-1  10.393089 -6.561907 -1.3468198 -10.0114684 -4.4460762
## CAMA-1   -17.180801  2.735580 -5.9790370  -0.6497663  1.3056740
# Turn the PCA score matrix into a data frame and attach the GeneralType of
# the same (filtered) rows as a Type column.
df <- as.data.frame(PCA$x)
df[["Type"]] <- data[["GeneralType"]][filter_cell]

# Scatter plot of the first two components, coloured by Type.
ggplot(df, aes(x = PC1, y = PC2)) +
  geom_point(aes(colour = Type))

clustering with the compressed data

# Clustered heatmap (average linkage) of the first two components plus the
# Type column; layout() then sets the figure size to 500 x 500.
layout(
  heatmaply(df[c("PC1", "PC2", "Type")],
            column_text_angle = 90,
            hclust_method = "average"),
  width = 500, height = 500
)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

Another dimensionality reduction : t-sne

Unlike PCA, t-SNE preserves local structure rather than global structure: only each point's closest neighbors are taken into account. The perplexity parameter controls roughly how many closest neighbors are considered.

Also, t-SNE is stochastic: unlike PCA, it gives a different result on each run unless the random seed is fixed (e.g. with set.seed()). However, the algorithm usually gives a visually pleasing outcome.

# Restrict the analysis to rows whose GeneralType is breast, lung or pancreas.
filter_cell <- data$GeneralType %in% c("breast", "lung", "pancreas")
# t-SNE is stochastic, so fix the seed to get the same embedding (and the same
# plots below) on every run.
set.seed(1000)
# Embed columns 3 onwards in 2 dimensions (selecting all numeric columns; this
# assumes the first two columns are the non-numeric annotations -- TODO
# confirm).
tsne <- Rtsne(data[filter_cell, 3:ncol(data)], dims = 2, perplexity = 30,
              max_iter = 5000)

# tsne$Y holds the embedding; as.data.frame() names its columns V1 and V2.
df <- as.data.frame(tsne$Y)
# Restore the row names of the embedded rows so that this data frame is
# labelled like the PCA one (tsne$Y itself does not carry them).
rownames(df) <- rownames(data)[filter_cell]
df$Type <- data$GeneralType[filter_cell]

# Scatter plot of the two t-SNE dimensions, coloured by Type.
ggplot(df, aes(x = V1, y = V2, col = Type)) + geom_point()

clustering with the compressed data

# Clustered heatmap (average linkage) of the t-SNE data frame; layout() then
# sets the figure size to 500 x 500.
layout(
  heatmaply(df,
            column_text_angle = 90,
            hclust_method = "average"),
  width = 500, height = 500
)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()